{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "import constants.consts as consts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_num_inters(rel_dict):\n",
    "    \"\"\"Return the combined length of every value stored in rel_dict.\n",
    "\n",
    "    rel_dict is a mapping whose values support len(); the lengths of all\n",
    "    values are added together. An empty mapping gives 0.\n",
    "    \"\"\"\n",
    "    return sum(len(neighbors) for neighbors in rel_dict.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Directory prefix for the pickled index dictionaries.\n",
    "data_path = 'data/' + consts.SONG_IX_DATA_DIR\n",
    "\n",
    "# Open the four rs_ix_* pickles in one statement, then unpickle each one\n",
    "# (in the same order) into its own module-level name.\n",
    "# NOTE: pickle can run arbitrary code while loading; only load trusted files.\n",
    "with open(data_path + 'rs_ix_song_person.dict', 'rb') as song_person_file, \\\n",
    "        open(data_path + 'rs_ix_person_song.dict', 'rb') as person_song_file, \\\n",
    "        open(data_path + 'rs_ix_song_user.dict', 'rb') as song_user_file, \\\n",
    "        open(data_path + 'rs_ix_user_song.dict', 'rb') as user_song_file:\n",
    "    rs_song_person = pickle.load(song_person_file)\n",
    "    rs_person_song = pickle.load(person_song_file)\n",
    "    rs_song_user = pickle.load(song_user_file)\n",
    "    rs_user_song = pickle.load(user_song_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"random sample 10% of KKBox\")\n",
    "\n",
    "# Entity counts: users and persons are the keys of their own dicts; songs are\n",
    "# the union of the keys of the two song-keyed dicts.\n",
    "num_users = len(rs_user_song)\n",
    "num_songs = len(set(rs_song_user).union(rs_song_person))\n",
    "num_persons = len(rs_person_song)\n",
    "print('# users:', num_users)\n",
    "print('# songs:', num_songs)\n",
    "print('# persons:', num_persons)\n",
    "\n",
    "# Interactions are the user->song entries; KG connections add the\n",
    "# person->song entries on top of those.\n",
    "num_inters = get_num_inters(rs_user_song)\n",
    "num_kg_connections = num_inters + get_num_inters(rs_person_song)\n",
    "print('# interactions:', num_inters)\n",
    "print('# KG connections:', num_kg_connections)\n",
    "\n",
    "print('# entities:', num_users + num_songs + num_persons)\n",
    "\n",
    "# Blank line, then the average number of interactions per user and per song.\n",
    "print()\n",
    "print('Interactions per user:', num_inters / num_users)\n",
    "print('Interactions per song:', num_inters / num_songs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Directory prefix for the pickled index dictionaries.\n",
    "data_path = 'data/' + consts.SONG_IX_DATA_DIR\n",
    "\n",
    "# Open the four dense_ix_* pickles in one statement, then unpickle each one\n",
    "# (in the same order) into its own module-level name.\n",
    "# NOTE: pickle can run arbitrary code while loading; only load trusted files.\n",
    "with open(data_path + 'dense_ix_song_person.dict', 'rb') as song_person_file, \\\n",
    "        open(data_path + 'dense_ix_person_song.dict', 'rb') as person_song_file, \\\n",
    "        open(data_path + 'dense_ix_song_user.dict', 'rb') as song_user_file, \\\n",
    "        open(data_path + 'dense_ix_user_song.dict', 'rb') as user_song_file:\n",
    "    dense_song_person = pickle.load(song_person_file)\n",
    "    dense_person_song = pickle.load(person_song_file)\n",
    "    dense_song_user = pickle.load(song_user_file)\n",
    "    dense_user_song = pickle.load(user_song_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"top 10% of KKBox via density\")\n",
    "\n",
    "# Entity counts for the dense subset. Every key set here must come from the\n",
    "# dense_* dicts: songs are the union of the keys of dense_song_user and\n",
    "# dense_song_person (using rs_song_person would mix in songs from the other sample).\n",
    "num_users = len(dense_user_song.keys())\n",
    "num_songs = len(set(dense_song_user.keys()) | set(dense_song_person.keys()))\n",
    "num_persons = len(dense_person_song.keys())\n",
    "print('# users:', num_users)\n",
    "print('# songs:',num_songs)\n",
    "print('# persons:',num_persons)\n",
    "\n",
    "# Interactions are the user->song entries; KG connections add the\n",
    "# person->song entries on top of those.\n",
    "num_inters = get_num_inters(dense_user_song)\n",
    "num_kg_connections = num_inters + get_num_inters(dense_person_song)\n",
    "\n",
    "print('# interactions:', num_inters)\n",
    "print('# KG connections:', num_kg_connections)\n",
    "\n",
    "print('# entities:', num_users + num_songs + num_persons)\n",
    "\n",
    "# Blank line, then the average number of interactions per user and per song.\n",
    "print()\n",
    "print('Interactions per user:', num_inters / num_users)\n",
    "print('Interactions per song:', num_inters / num_songs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# For KKBox, the original paper has about 88 interactions per user and 1.6 interactions per song.\n",
    "# MI has about 166 interactions per user and 258 interactions per song.\n",
    "\n",
    "# Also note: the sampling process took 10% of users, songs, and persons, but only those connected to other entities\n",
    "# in our KG actually get added to a dictionary, so the number of entries in the rs and dense dictionaries varies.\n",
    "# Number of relation types = 5 (the paper had 6, so maybe it counts an unknown relation?)\n",
    "\n",
    "# How best to compute path statistics?\n",
    "# Could compute the average path length within the test and train data, for both the sampled and non-sampled versions.\n",
    "# Could also compute the average number of paths within the non-sampled version."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
